{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Class07"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import statsmodels.formula.api as sm\n",
    "from linearmodels import PanelOLS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Merge CRSP and Compustat\n",
    "The unique identifier in CRSP and Compustat is **PERMNO** and **GVKEY** respectively. To merge the two databases, **CUSIP** is used. Both CRSP and Compustat use most recent CUSIP."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Read CRSP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "file_path = '/Users/ml/Google Drive/af/teaching/database/data/'\n",
    "msf_raw = pd.read_csv(file_path+'msf_1992_2017.txt',sep='\\t',low_memory=False)\n",
    "msf_raw.columns = msf_raw.columns.str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "msf = msf_raw[(msf_raw['shrcd'].isin([10,11]))&(msf_raw['exchcd'].isin([1,2,3]))].copy()\n",
    "msf['ret'] = pd.to_numeric(msf['ret'],errors='coerce')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "msf = msf.sort_values(['permno','date']).reset_index(drop=True)\n",
    "msf = msf.drop_duplicates(['permno','date'])\n",
    "msf = msf.sort_values(['permno','date']).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "msf['yyyymm'] =(msf['date']/100).astype(int)\n",
    "msf['calyr'] = (msf['yyyymm']/100).astype(int)\n",
    "msf['month'] = (msf['yyyymm']%100).astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Match CRSP return data from July in year t to June in year t+1 with accounting data in year t-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "msf['mergeyr'] = np.where((msf['month']>=7)&(msf['month']<=12),msf['calyr'],msf['calyr']-1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Read Compustat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "ag = pd.read_stata(file_path+'roa.dta')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "ag['calyr'] = (ag['datadate']/10000).astype(int)\n",
    "ag['mergeyr'] = ag['calyr'] + 1\n",
    "ag['cusip'] =ag['cusip'].str[:8]\n",
    "ag['ag'] = ag['at'] / ag['at_l1'] - 1\n",
    "p1 = ag['ag'].quantile(0.01)\n",
    "p99 = ag['ag'].quantile(0.99)\n",
    "ag = ag[(ag['ag']>=p1)&(ag['ag']<=p99)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Merge return with asset growth"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "msf_1 = msf.merge(ag[['cusip','mergeyr','ag']],how='inner',on=['cusip','mergeyr'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>           <td>ret</td>       <th>  R-squared:         </th>  <td>   0.001</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th>  <td>   0.001</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th>  <td>   681.4</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Thu, 16 Aug 2018</td> <th>  Prob (F-statistic):</th>  <td>3.65e-150</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>15:52:36</td>     <th>  Log-Likelihood:    </th> <td>3.4377e+05</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>1263251</td>     <th>  AIC:               </th> <td>-6.875e+05</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>1263249</td>     <th>  BIC:               </th> <td>-6.875e+05</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     1</td>      <th>                     </th>      <td> </td>    \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>      <td> </td>    \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "      <td></td>         <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Intercept</th> <td>    0.0137</td> <td>    0.000</td> <td>   80.166</td> <td> 0.000</td> <td>    0.013</td> <td>    0.014</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>ag</th>        <td>   -0.0046</td> <td>    0.000</td> <td>  -26.103</td> <td> 0.000</td> <td>   -0.005</td> <td>   -0.004</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>1539007.208</td> <th>  Durbin-Watson:     </th>    <td>   2.027</td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th>   <td> 0.000</td>    <th>  Jarque-Bera (JB):  </th> <td>2176338402.768</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>            <td> 5.722</td>    <th>  Prob(JB):          </th>    <td>    0.00</td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>        <td>206.018</td>   <th>  Cond. No.          </th>    <td>    1.35</td>   \n",
       "</tr>\n",
       "</table><br/><br/>Warnings:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                    ret   R-squared:                       0.001\n",
       "Model:                            OLS   Adj. R-squared:                  0.001\n",
       "Method:                 Least Squares   F-statistic:                     681.4\n",
       "Date:                Thu, 16 Aug 2018   Prob (F-statistic):          3.65e-150\n",
       "Time:                        15:52:36   Log-Likelihood:             3.4377e+05\n",
       "No. Observations:             1263251   AIC:                        -6.875e+05\n",
       "Df Residuals:                 1263249   BIC:                        -6.875e+05\n",
       "Df Model:                           1                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "Intercept      0.0137      0.000     80.166      0.000       0.013       0.014\n",
       "ag            -0.0046      0.000    -26.103      0.000      -0.005      -0.004\n",
       "==============================================================================\n",
       "Omnibus:                  1539007.208   Durbin-Watson:                   2.027\n",
       "Prob(Omnibus):                  0.000   Jarque-Bera (JB):       2176338402.768\n",
       "Skew:                           5.722   Prob(JB):                         0.00\n",
       "Kurtosis:                     206.018   Cond. No.                         1.35\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "\"\"\""
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sm.ols('ret~ag',msf_1).fit().summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Merge CRSP and Thomson Reuters 13F\n",
    "CUSIP in Thomson Reuters 13F is histotical CUSIP which is the NCUSIP in CRSP, so to merge CRSP and 13F, we need match NCUSIP in CRSP with CUSIP in 13F. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Read 13F"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "tr13f = pd.read_csv(file_path+'tr13f.txt',sep='\\t',low_memory=False)\n",
    "tr13f.columns = tr13f.columns.str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "tr13f = tr13f.drop_duplicates(['cusip','rdate'])\n",
    "tr13f = tr13f.rename(columns={'cusip':'ncusip'})\n",
    "tr13f = tr13f.sort_values(['ncusip','rdate']).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Match CRSP return data from July in year t to June in year t+1 with 13F data in June of year t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "tr13f['mergeyr'] = (tr13f['rdate']/10000).astype(int)\n",
    "tr13f['month'] = (tr13f['rdate']/100).astype(int)%100\n",
    "tr13f = tr13f[tr13f['month']==6][['ncusip','mergeyr','numinstowners','instown_perc']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>permno</th>\n",
       "      <th>date</th>\n",
       "      <th>shrcd</th>\n",
       "      <th>exchcd</th>\n",
       "      <th>ncusip</th>\n",
       "      <th>cusip</th>\n",
       "      <th>ret</th>\n",
       "      <th>yyyymm</th>\n",
       "      <th>calyr</th>\n",
       "      <th>month</th>\n",
       "      <th>mergeyr</th>\n",
       "      <th>numinstowners</th>\n",
       "      <th>instown_perc</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10001</td>\n",
       "      <td>19920731</td>\n",
       "      <td>11.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>39040610</td>\n",
       "      <td>36720410</td>\n",
       "      <td>0.063830</td>\n",
       "      <td>199207</td>\n",
       "      <td>1992</td>\n",
       "      <td>7</td>\n",
       "      <td>1992</td>\n",
       "      <td>3</td>\n",
       "      <td>0.009185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10001</td>\n",
       "      <td>19920831</td>\n",
       "      <td>11.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>39040610</td>\n",
       "      <td>36720410</td>\n",
       "      <td>0.040000</td>\n",
       "      <td>199208</td>\n",
       "      <td>1992</td>\n",
       "      <td>8</td>\n",
       "      <td>1992</td>\n",
       "      <td>3</td>\n",
       "      <td>0.009185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10001</td>\n",
       "      <td>19920930</td>\n",
       "      <td>11.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>39040610</td>\n",
       "      <td>36720410</td>\n",
       "      <td>0.165962</td>\n",
       "      <td>199209</td>\n",
       "      <td>1992</td>\n",
       "      <td>9</td>\n",
       "      <td>1992</td>\n",
       "      <td>3</td>\n",
       "      <td>0.009185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10001</td>\n",
       "      <td>19921030</td>\n",
       "      <td>11.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>39040610</td>\n",
       "      <td>36720410</td>\n",
       "      <td>-0.025000</td>\n",
       "      <td>199210</td>\n",
       "      <td>1992</td>\n",
       "      <td>10</td>\n",
       "      <td>1992</td>\n",
       "      <td>3</td>\n",
       "      <td>0.009185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10001</td>\n",
       "      <td>19921130</td>\n",
       "      <td>11.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>39040610</td>\n",
       "      <td>36720410</td>\n",
       "      <td>-0.017094</td>\n",
       "      <td>199211</td>\n",
       "      <td>1992</td>\n",
       "      <td>11</td>\n",
       "      <td>1992</td>\n",
       "      <td>3</td>\n",
       "      <td>0.009185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>10001</td>\n",
       "      <td>19921231</td>\n",
       "      <td>11.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>39040610</td>\n",
       "      <td>36720410</td>\n",
       "      <td>-0.015130</td>\n",
       "      <td>199212</td>\n",
       "      <td>1992</td>\n",
       "      <td>12</td>\n",
       "      <td>1992</td>\n",
       "      <td>3</td>\n",
       "      <td>0.009185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>10001</td>\n",
       "      <td>19930129</td>\n",
       "      <td>11.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>39040610</td>\n",
       "      <td>36720410</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>199301</td>\n",
       "      <td>1993</td>\n",
       "      <td>1</td>\n",
       "      <td>1992</td>\n",
       "      <td>3</td>\n",
       "      <td>0.009185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>10001</td>\n",
       "      <td>19930226</td>\n",
       "      <td>11.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>39040610</td>\n",
       "      <td>36720410</td>\n",
       "      <td>0.017857</td>\n",
       "      <td>199302</td>\n",
       "      <td>1993</td>\n",
       "      <td>2</td>\n",
       "      <td>1992</td>\n",
       "      <td>3</td>\n",
       "      <td>0.009185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>10001</td>\n",
       "      <td>19930331</td>\n",
       "      <td>11.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>39040610</td>\n",
       "      <td>36720410</td>\n",
       "      <td>0.011053</td>\n",
       "      <td>199303</td>\n",
       "      <td>1993</td>\n",
       "      <td>3</td>\n",
       "      <td>1992</td>\n",
       "      <td>3</td>\n",
       "      <td>0.009185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10001</td>\n",
       "      <td>19930430</td>\n",
       "      <td>11.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>39040610</td>\n",
       "      <td>36720410</td>\n",
       "      <td>0.070175</td>\n",
       "      <td>199304</td>\n",
       "      <td>1993</td>\n",
       "      <td>4</td>\n",
       "      <td>1992</td>\n",
       "      <td>3</td>\n",
       "      <td>0.009185</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   permno      date  shrcd  exchcd    ncusip     cusip       ret  yyyymm  \\\n",
       "0   10001  19920731   11.0     3.0  39040610  36720410  0.063830  199207   \n",
       "1   10001  19920831   11.0     3.0  39040610  36720410  0.040000  199208   \n",
       "2   10001  19920930   11.0     3.0  39040610  36720410  0.165962  199209   \n",
       "3   10001  19921030   11.0     3.0  39040610  36720410 -0.025000  199210   \n",
       "4   10001  19921130   11.0     3.0  39040610  36720410 -0.017094  199211   \n",
       "5   10001  19921231   11.0     3.0  39040610  36720410 -0.015130  199212   \n",
       "6   10001  19930129   11.0     3.0  39040610  36720410  0.000000  199301   \n",
       "7   10001  19930226   11.0     3.0  39040610  36720410  0.017857  199302   \n",
       "8   10001  19930331   11.0     3.0  39040610  36720410  0.011053  199303   \n",
       "9   10001  19930430   11.0     3.0  39040610  36720410  0.070175  199304   \n",
       "\n",
       "   calyr  month  mergeyr  numinstowners  instown_perc  \n",
       "0   1992      7     1992              3      0.009185  \n",
       "1   1992      8     1992              3      0.009185  \n",
       "2   1992      9     1992              3      0.009185  \n",
       "3   1992     10     1992              3      0.009185  \n",
       "4   1992     11     1992              3      0.009185  \n",
       "5   1992     12     1992              3      0.009185  \n",
       "6   1993      1     1992              3      0.009185  \n",
       "7   1993      2     1992              3      0.009185  \n",
       "8   1993      3     1992              3      0.009185  \n",
       "9   1993      4     1992              3      0.009185  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "msf_2 = msf.merge(tr13f,how='inner',on=['ncusip','mergeyr'])\n",
    "msf_2.head(10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
